# -*- coding: utf-8 -*-
"""
This file creates the section-level data used in The Deaths of Ideas in Congress
First created: 6/30/22
Final edit: 12/13/23
"""

import os
import pandas as pd
import numpy as np

#Congresses covered by the section-level files read below
cong = [103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114]

#Similarity-score columns of the reintroduction match files that define the
#identical / near-identical robustness flags
score_cols = ['txt1_ad_5', 'txt2_ad_5', 'perblock_txt1', 'perblock_txt2',
              'first_txt1_ad_2', 'first_txt2_ad_2']

#Collect one frame per Congress and concatenate once after the loop.
#DataFrame.append no longer exists in pandas 2.x, and growing a frame inside
#the loop re-copies all earlier rows on every pass.
secs_frames = []
reintro_frames = []
enacted_frames = []

for i in cong:
    num = str(i)

    #Bring in all sections
    new_secs = pd.read_csv('/replication/Cleaned Sections/'+num+'/'+num+'_sections_noboilerplate.csv')
    secs_frames.append(new_secs)

    #Import reintroduced sections
    os.chdir('/replication/Reintroduced')
    new_reintro = pd.read_csv('secs'+num+'_reintroduced.csv')

    #Create identical and near identical robustness columns:
    #identical = all six scores equal 1, near identical = all six scores >= 0.95
    new_reintro['identical'] = np.where((new_reintro[score_cols] == 1).all(axis=1), 1, 0)
    new_reintro['near_identical'] = np.where((new_reintro[score_cols] >= 0.95).all(axis=1), 1, 0)

    #Group reintroduced matches by sec1 so each section has a single row of totals.
    #The columns are selected with a list: tuple selection raises on pandas 2.x.
    reintro_secs = new_reintro.groupby('sec1')[['match', 'identical', 'near_identical']].sum()
    reintro_secs['section'] = reintro_secs.index
    reintro_secs = reintro_secs.rename(columns={"match":"reintro_match", "identical":'reintro_identical', 'near_identical':'reintro_near_identical'})
    reintro_frames.append(reintro_secs)

    #Import enacted sections
    os.chdir('/replication/Enacted Sections')
    new_enacted = pd.read_csv('secs'+num+'_enacted.csv')
    enacted_secs = new_enacted.groupby('sec1')[['match']].count() #Collapse matches into a single row for each sec1
    enacted_secs['section'] = enacted_secs.index
    enacted_secs = enacted_secs[['section', 'match']].rename(columns={"match":"enacted_match"})
    enacted_frames.append(enacted_secs)

#All sections, restricted to introduced versions (bill contains "ih" or "is")
secs_all = pd.concat(secs_frames, sort=False).drop(['Unnamed: 0'], axis=1)
introduced = secs_all['bill'].str.contains("ih", na=False) | secs_all['bill'].str.contains("is", na=False)
secs_all = secs_all.loc[introduced]

#append(..., sort=True) left these frames with alphabetically ordered columns; keep that order
reintro = pd.concat(reintro_frames, sort=False).sort_index(axis=1)
enacted = pd.concat(enacted_frames, sort=False).sort_index(axis=1)

#The steps below used to be repeated on every pass of the loop although only the
#result of the final pass (built from all Congresses) was kept; they now run once.

#Create df of unenacted sections
unenacted = pd.merge(secs_all, enacted, on='section', how='left')
unenacted = unenacted[unenacted['enacted_match'].isnull()]

#Create DV of reintroduced or not
dv = pd.merge(unenacted, reintro, on='section', how='left')

#Create DV columns: reintroduced = at least one reintroduction match, death = none
dv['reintro_match'] = dv['reintro_match'].fillna(0)
dv['reintroduced'] = dv['reintro_match'].astype(int)
dv["reintroduced"] = np.where(dv["reintroduced"] > 0, 1, 0)
dv["death"] = np.where(dv["reintroduced"] > 0, 0, 1)
dv["reintro_identical"] = np.where(dv["reintro_identical"] > 0, 1, 0)
dv["reintro_near_identical"] = np.where(dv["reintro_near_identical"] > 0, 1, 0)
dv['billid'] = dv['section'].str.split('_').str[0] #Text before the first underscore of the section id

#Complexity variable
dv['sec_complexity'] = dv['clean_text'].str.len()
dv['sec_complexity'] = dv['sec_complexity'].fillna(30000) #Limit for inclusion was 30000 character length to fit in the CSV file, so this is an artificial ceiling given that original limit.

#Create variables to remove tariff and ceremonial legislation.
#Each flag is 1 when clean_text contains the phrase and 0 otherwise.
ceremonial_phrases = {
    'tariff': "harmonized tariff", #Tariffs
    'post_office': "facility united states postal service", #Post office namings
    'coins': "coin specifications", #Minting ceremonial coins
    'gold_medal': "gold medal", #Minting gold medals
    'facility': "known designated", #Facility namings
}
for flag, phrase in ceremonial_phrases.items():
    dv.loc[dv['clean_text'].str.contains(phrase, na=False), flag] = 1
for flag in ceremonial_phrases:
    dv[flag] = dv[flag].fillna(0)

#Get sponsor and policy agendas code for each bill
cbp = pd.read_csv('/replication/bills102-114.csv')
cbp = cbp.reset_index(drop=True)

#Take an explicit copy of the needed columns: assigning to a frame sliced out of
#cbp triggers pandas' SettingWithCopyWarning.
cbp_slim = cbp[['cong','billid', 'cosponsr', 'pooleid', 'impbill', 'major', 'minor', 'mref', 'chref', 'rankref', 'subchref', 'subrankref']].copy()

#Strip hyphens and lowercase the bill ids so they can be joined to dv['billid']
cbp_slim['billid'] = cbp_slim['billid'].str.replace('-', '', regex=False).str.lower()
cbp_slim = cbp_slim.rename(columns={"pooleid":"icpsr", 'cong':'Congress'})

#Merge Congressional bills project data with section-level data
dv = pd.merge(dv, cbp_slim, on='billid', how='left')

#Appropriations bills: flag sections whose bill appears in the appropriations list
approps = pd.read_excel('/replication/appropriations_bills.xlsx')
dv = dv.merge(approps, how='left', on='billid')
is_approps = dv["approps"] > 0 #Unmatched bills are NaN after the left merge and compare False
dv["approps"] = np.where(is_approps, 1, 0)
dv["approps"].describe()

#Senate bills: 1 when the bill id contains the letter "s", 0 otherwise
#NOTE(review): the test matches an "s" anywhere in the id, so ids such as
#"...hres..." would also be flagged - confirm against the bill id format.
has_s = dv['billid'].str.contains('s', na=False)
dv.loc[has_s, 'senate'] = 1
dv["senate"] = np.where(dv["senate"] > 0, 1, 0)

#Create companion bills and momentum variables
#For every same-Congress match (bill1/sec1 vs bill2/sec2):
#  companion             - one side is a House introduced version ("ih"), the other a Senate one ("is")
#  samechamber_duplicate - both sides are introduced versions from the same chamber
#  momentum              - bill1 is an introduced version and bill2 is not
comp_frames = []
sec2_frames = [] #sec2 ids from every Congress, needed for the not-first flags below
for i in cong:
    num = str(i)
    companion = pd.read_csv('/replication/samecong_matches'+num+'.csv')
    companion = companion.sort_values(['bill1', 'date1'])

    b1_house = companion['bill1'].str.contains("ih", na=False)
    b1_senate = companion['bill1'].str.contains("is", na=False)
    b2_house = companion['bill2'].str.contains("ih", na=False)
    b2_senate = companion['bill2'].str.contains("is", na=False)

    companion["companion"] = np.where((b1_house & b2_senate) | (b1_senate & b2_house), 1, 0)
    companion["samechamber_duplicate"] = np.where((b1_house & b2_house) | (b1_senate & b2_senate), 1, 0)
    #bill2 must be neither "ih" nor "is". The earlier test "(not ih) OR (not is)" was
    #true for any bill2 that did not contain both tokens, so it flagged nearly every row.
    companion["momentum"] = np.where((b1_house | b1_senate) & ~b2_house & ~b2_senate, 1, 0)

    #Collapse to one row per sec1 with the number of matches of each kind.
    #Columns are selected with a list: multi-key selection without one raises on pandas 2.x.
    companion2 = companion[['sec1', 'companion', 'momentum', 'samechamber_duplicate']]
    companion2 = companion2.rename(columns={'sec1':'section'})
    companion2 = companion2.groupby('section')[['momentum', 'companion', 'samechamber_duplicate']].sum()
    companion2.reset_index(inplace=True)
    comp_frames.append(companion2)
    sec2_frames.append(companion['sec2'])

#DataFrame.append no longer exists in pandas 2.x; concatenate once instead
comp_all = pd.concat(comp_frames, sort=False)
all_sec2 = pd.concat(sec2_frames).drop_duplicates()

dv = pd.merge(dv, comp_all, on='section', how='left')
dv["momentum"] = np.where(dv["momentum"] > 0, 1, 0)
dv["companion"] = np.where(dv["companion"] > 0, 1, 0)
dv['samechamber_duplicate'] = np.where(dv['samechamber_duplicate'] > 0, 1, 0)

#A section is "not first" when it also appears as sec2 of some match.
#The lookup uses sec2 from all Congresses: the earlier code read the loop variable
#`companion`, which after the loop only held the last Congress in `cong`.
in_sec2 = dv.section.isin(all_sec2)

dv['not_firstsamechamb'] = np.where((dv['samechamber_duplicate'] == 1) & in_sec2, 1, 0)
dv['not_firstsamechamb'].describe()

#Take sec2 that are companions and match to sec1 and omit
dv['not_firstcompanion'] = np.where((dv['companion'] == 1) & in_sec2, 1, 0)
dv['not_firstcompanion'].describe()

dv["not_first"] = np.where((dv["not_firstsamechamb"] > 0) | (dv["not_firstcompanion"] > 0), 1, 0)
dv['not_first'].describe()
dv['not_first'].sum()

#Determine if ICPSR or bioname was in next term
#Import LES data and use Legislator name
house_name = 'Legislator name, as given in THOMAS'
les_h = pd.read_excel('/replication/CELHouse93to116.xlsx') #House LES
les_h = les_h.rename(columns={"ICPSR number, according to Poole and Rosenthal": "icpsr",
                              "Congress number": "Congress"})
les_h = les_h.sort_values([house_name, 'Congress'])

#nextterm holds the name found on the following row of the same legislator (NaN on
#the legislator's last row); next_term2 is 1 when such a row exists.
#NOTE(review): this is 1 whenever any later row exists for the name, not only when
#that row is the immediately following Congress - confirm this is intended.
les_h['nextterm'] = les_h.groupby(house_name)[house_name].shift(periods=-1)
les_h['next_term2'] = les_h[house_name].eq(les_h['nextterm']).astype(int)

#Source column -> output name, in output order (identity entries are kept as is)
house_columns = {
    'Congress': 'Congress',
    "icpsr": "icpsr",
    'Two-letter state code': 'state',
    '1 = Democrat': 'democrat',
    'Name in bioguide': 'sponsor',
    'First-dimension DW-NOMINATE score': 'dwnom1',
    '1 = female': 'female',
    'Number of substantive bills sponsored': 'bills_sponsored',
    '1 = majority party member': 'majority',
    'Absolute distance from floor median (DW-NOMINATE)': 'ideodist_floormedian',
    'Absolute distance from majority party median (DW-NOMINATE)': 'ideodist_partymedian',
    'Legislative Effectiveness Score (1-5-10)': 'les_score',
    'Lagged LES': 'lag_lesscore',
    'Seniority, number of terms served counting current': 'seniority',
    'next_term2': 'next_term2',
    '1 = subcommittee chair (or vice chair), according to Almanac of American Politic': 'subchair',
    'Percent vote received to enter this Congress': 'prev_voteshare',
    '1 = Below, 2 = Meets, 3 = Exceeds': 'les_benchmark',
}

#Select the LES variables and rename them in one step
les_h_slim = les_h[list(house_columns)].rename(columns=house_columns)

#Senate LES
senate_name = 'Name in bioguide'
les_s = pd.read_excel('/replication/CELSenate93to116.xlsx')
les_s = les_s.rename(columns={"icpsr number": "icpsr", "congress number": "Congress"})
les_s = les_s.sort_values([senate_name, 'Congress'])

#nextterm holds the name found on the following row of the same senator (NaN on the
#senator's last row); next_term2 is 1 when such a row exists.
#NOTE(review): any later row counts, not only the immediately following Congress - confirm.
les_s['nextterm'] = les_s.groupby(senate_name)[senate_name].shift(periods=-1)
les_s['next_term2'] = les_s[senate_name].eq(les_s['nextterm']).astype(int)

#Source column -> output name, in output order (identity entries are kept as is)
senate_columns = {
    'Congress': 'Congress',
    "icpsr": "icpsr",
    'two letter state abbreviation': 'state',
    'Name in bioguide': 'sponsor',
    'Number of substantive bills sponsored by this senator': 'bills_sponsored',
    '1 if senator is democrat': 'democrat',
    '1 if senator is in majority party': 'majority',
    '1 if senator is female': 'female',
    'vote share in last election': 'prev_voteshare',
    '1 if senator is a subcommittee chair': 'subchair',
    'seniority': 'seniority',
    'legislative effectiveness score': 'les_score',
    'lagged legislative effectiveness score': 'lag_lesscore',
    '1 if freshman senator': 'freshman',
    'First dimensions dw-nominate score': 'dwnom1',
    'Absolute distance from floor median (dwnom1)': 'ideodist_floormedian',
    'Absolute distance from majority-party median (dwnom1)': 'ideodist_partymedian',
    'next_term2': 'next_term2',
    '1 = Below, 2 = Meets, 3 = Exceeds': 'les_benchmark',
}

#Select the LES variables and rename them in one step
les_s_slim = les_s[list(senate_columns)].rename(columns=senate_columns)


#Stack House and Senate LES data. pd.concat replaces DataFrame.append, which no longer
#exists in pandas 2.x; sort=True keeps the alphabetical column order append(sort=True)
#produced, and ignore_index avoids duplicate row labels from the two source files.
les_slim = pd.concat([les_h_slim, les_s_slim], sort=True, ignore_index=True)

#Order each sponsor's rows by Congress before shifting, so the lagged benchmark is
#taken from the sponsor's preceding row in time, not from whatever order the
#two input files happened to be stacked in.
les_slim = les_slim.sort_values(['sponsor', 'Congress'])
les_slim['les_benchmarklag'] = les_slim.groupby('sponsor')['les_benchmark'].shift(periods=1)

#Committee and party leader tables for both chambers
def _rows_with_codes(assignments, first_band, second_band):
    #Rows whose "Senior Party Member" code falls inside either inclusive (low, high) band
    code = assignments["Senior Party Member"]
    return assignments[code.between(*first_band) | code.between(*second_band)]

def _leader_flags(rows, flag_name):
    #Keep the Congress/icpsr keys and expose the code under flag_name
    keyed = rows[['Congress', 'icpsr', "Senior Party Member"]]
    return keyed.rename(columns={"Senior Party Member": flag_name})

#Senate committee and party leader data
os.chdir('/replication')
senate_comm = pd.read_excel("Senate_assignments_103-117.xlsx").rename(columns={"ID #": "icpsr"})
#Codes 11-16 and 21-24 are treated as committee leadership
s_comm_lead = _leader_flags(_rows_with_codes(senate_comm, (11, 16), (21, 24)), "committee_leader")
#Codes 31-53 and 61-66 are treated as party leadership
s_cham_lead = _rows_with_codes(senate_comm, (31, 53), (61, 66))
s_party_lead = _leader_flags(s_cham_lead, "party_leader")

#House committee and party leader data (same code bands as the Senate)
house_comm = pd.read_excel("House_assignments_103-117.xls").rename(columns={"ID #": "icpsr"})
h_comm_lead = _leader_flags(_rows_with_codes(house_comm, (11, 16), (21, 24)), "committee_leader")
h_cham_lead = _rows_with_codes(house_comm, (31, 53), (61, 66))
h_party_lead = _leader_flags(h_cham_lead, "party_leader")

#Merge party and committee leader into member data
#pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
#The leader tables may hold several rows for one member in one Congress
#(presumably one per assignment - confirm); keep a single row per (Congress, icpsr)
#so the left merges below cannot multiply member rows. Which duplicate is kept does
#not matter because the code is reduced to a 0/1 flag right after the merge.
member_key = ['Congress', 'icpsr']
committee_lead = pd.concat([h_comm_lead, s_comm_lead], sort=False).drop_duplicates(subset=member_key)
party_lead = pd.concat([h_party_lead, s_party_lead], sort=False).drop_duplicates(subset=member_key)

les_slim = pd.merge(les_slim, committee_lead, on=member_key, how='left')
les_slim["committee_leader"] = np.where(les_slim["committee_leader"] > 0, 1, 0) #Unmatched members are NaN -> 0

les_slim = pd.merge(les_slim, party_lead, on=member_key, how='left')
les_slim["party_leader"] = np.where(les_slim["party_leader"] > 0, 1, 0) #Unmatched members are NaN -> 0

#Lag committee leader/party leader/majority status
def _add_lag_and_change(frame, source, lag_col, change_col):
    #lag_col = value of `source` on the sponsor's previous row (NaN on the first row);
    #change_col = current value minus that previous value
    frame[lag_col] = frame.groupby('sponsor')[source].shift(periods=1)
    frame[change_col] = frame[source] - frame[lag_col]

member_term_order = ['sponsor', 'Congress']

les_slim = les_slim.sort_values(member_term_order)
_add_lag_and_change(les_slim, 'committee_leader', 'l.com_leader', 'lag_commleader')
#A sponsor's first row has no previous value; treat the change as 0
les_slim['lag_commleader'] = les_slim['lag_commleader'].fillna(0).astype(int)
les_slim = les_slim.sort_values(member_term_order)

_add_lag_and_change(les_slim, 'party_leader', 'l.party_leader', 'lag_partyleader')
les_slim['lag_partyleader'] = les_slim['lag_partyleader'].fillna(0)

#The majority change is left as NaN on a sponsor's first row
_add_lag_and_change(les_slim, 'majority', 'l.majority', 'lag_majority')

#Subset to exclude freshmen because the LES score is lagged
member_data = les_slim[les_slim['seniority'] > 1]

###Create data for enacted policies from previous Congress
#Get billid into enacted sections (text before the first underscore of the section id)
enacted['billid'] = enacted['section'].str.split('_').str[0]

#Attach the PAP codes to each enacted section
issue_area = enacted.merge(cbp_slim, on='billid', how='left')

#Minor topic: number of enacted sections per minor code and Congress
minor_counts = issue_area.groupby(['minor', 'Congress']).count().reset_index()
minor_counts = minor_counts[['minor', 'section', 'Congress']].rename(columns={"section":"minor_count"})
minor_counts = minor_counts.sort_values(['minor', 'Congress']).set_index('minor')
minor_counts['Congress'] = minor_counts['Congress'].astype(int)

#Expand to every (minor code, Congress) pair between the first and last observed
#Congress, so a Congress without enacted sections becomes a 0 and the lag below
#always refers to the Congress immediately before
congress_span = range(minor_counts.Congress.min(), minor_counts.Congress.max()+1)
full_grid = pd.MultiIndex.from_product([minor_counts.index.unique(), congress_span],
                                       names=['minor','Congress'])
issue_area_minor = minor_counts.set_index('Congress', append=True).reindex(full_grid).reset_index()
issue_area_minor['minor_count'] = issue_area_minor['minor_count'].fillna(0)
issue_area_minor['lag_minor'] = issue_area_minor.groupby(['minor'])['minor_count'].shift(periods=1)

#Major topic: same count and lag, without the expansion step (the lag is taken
#from the previous observed row for the major code)
major_counts = issue_area.groupby(['major', 'Congress']).count().reset_index()
major_counts = major_counts[['major', 'section', 'Congress']].rename(columns={"section":"major_count"})
issue_area_major = major_counts.sort_values(['major', 'Congress'])
issue_area_major['lag_major'] = issue_area_major.groupby(['major'])['major_count'].shift(periods=1)

dv = dv.merge(issue_area_minor, how="left", on=["minor", "Congress"]) #Merge lagged minor PAP data into main df
dv = dv.merge(issue_area_major, how="left", on=["major", "Congress"]) #Merge lagged major PAP data into main df

#Create and merge bill size variable into df: number of sections per bill.
#Only the section column is counted; counting every column of dv first was wasted work.
bill_size = dv.groupby('bill')['section'].count().rename('bill_section_count').reset_index()
dv = pd.merge(dv, bill_size, how="left", on="bill")

#Merge institutional-level variables into df
cong_vars = pd.read_excel('/replication/Institutional_Variables.xlsx')
dv = pd.merge(dv, cong_vars, how="left", on="Congress")

#Sections without a sponsor id cannot be matched to member data
dv = dv.dropna(subset=['icpsr'])
full_data = pd.merge(dv, member_data, how="left", on=["Congress", "icpsr"])

#Drop the section text columns from the exported file
full_data = full_data.drop(columns=['clean_text', 'original_text'])

duplicates = full_data[full_data.duplicated(['section'])] #Identify duplicated sections and remove
full_data = full_data.drop_duplicates(subset=['section'], keep='first')

#Export data. The path uses forward slashes like every input path in this script;
#the earlier backslash form only resolved to the replication folder on Windows.
full_data.to_csv('/replication/deathideas_data.csv')

